/*********************************************************************************

         Description: Going from Raw Data to Collapsed Data

         This do-file takes raw stacked unanonymized census data, 
         merges with cousin marriage data (by surname), cousin marriage
         ban assignment and creates the final collapsed data at the 
         birthstate-birthdecade-residencestate-censusyear-cm (high versus low)
         level on which the final programs are run.
	
*********************************************************************************/

clear all

** Cousin marriage inputs: isonymy estimates produced by Create_CM.do **
use "$dir/Data/Final/Isonymy_fs.dta"

* Keep only rows flagged pre == 1 (presumably the pre-ban period -- confirm in Create_CM.do)
keep if pre==1

* One row per surname: isonymy measures are averaged using N_fs as frequency
* weights, while the three head-counts are plain (unweighted) totals.
collapse ///
	(mean)   IsoNonrand_f = IsoNonrand_fs ///
	         IsoRand_f    = IsoRand_fs    ///
	         IsoObs_f     = IsoObs_fs     ///
	(rawsum) N_f  = N_fs  ///
	         Nm_f = Nm_fs ///
	         Nf_f = Nf_fs ///
	[fweight = N_fs], by(surname)

* Cousin marriage rate = 4 x isonymy, with negative isonymy floored at zero.
* max() ignores missing arguments, so a missing isonymy value yields a rate of 0.
gen cm     = 4 * max(IsoNonrand_f, 0)
gen cm_obs = 4 * max(IsoObs_f, 0)
**************************************************
***** NYSIIS surname-rates of cousin marriage ****
**************************************************

* Phonetic (NYSIIS) code for every surname; -nysiis- is a user-written command *
nysiis surname, gen(surname_nysiis)

* Size of each NYSIIS group and every surname's share of its group
egen nysiis_groupN = total(N_f), by(surname_nysiis)
gen  nysiis_weight = N_f / nysiis_groupN

* Group-level rate: size-weighted average of the member surnames' cm rates
egen cm_nysiis = total(cm * nysiis_weight), by(surname_nysiis)

* Helper variables are not needed beyond this point
drop surname_nysiis nysiis_groupN nysiis_weight

sort surname

* High cousin marriage surname indicator: 1 when the surname-level rate is at
* least 0.1, 0 otherwise. For the robustness checks, redefine this dummy from
* cm_obs, cm_nysiis, or a different threshold.
gen cmH = (cm >= 0.1)

* Isonymy measures
label variable IsoNonrand_f "Non-random isonymy"
label variable IsoRand_f    "Random Isonymy"
label variable IsoObs_f     "Observed isonymy"

* Cousin marriage rates and the high-rate dummy
label variable cm        "Cousin marriage rate"
label variable cm_obs    "Cousin marriage rate (observed isonymy)"
label variable cm_nysiis "Cousin marriage rate (NYSIIS)"
label variable cmH       ">10% cousin marriage rate (-1858)"

* Counts underlying the surname-level rates
label variable N_f  "Number of individuals"
label variable Nm_f "Number of grooms"
label variable Nf_f "Number of brides"

** Merge census data with all relevant variables **
* Master is one row per surname; the census file has many individuals per
* surname. Unmatched rows from either side are kept (no keep() restriction).
merge 1:m surname using "$dir/Data/Original/census_data_uncollapsed.dta"
drop _merge

** Merge with year of ban (treatment) based on own birthplace **
* Path is anchored on $dir like every other dataset in this do-file, so the
* merge no longer depends on the current working directory.
merge m:1 state using "$dir/Data/Original/year_of_ban.dta"
drop _merge
sort year histid

* ---- Cell identifiers ----
label variable birthdecade   "Birth decade"
label variable birthstatefip "Birth state"
label variable resstatefip   "Residence state"
label variable year          "Census year"
* count_dor is simply a column of 1s in the uncollapsed data
label variable count_dor     "Num census individuals in cell"

* ---- Income, location and migration ----
label variable occscore       "Occscore"
label variable inchat         "LIDO Income (Original)"
label variable citypop        "City size"
label variable urban          "Urban"
label variable farm           "Residence: Farm"
label variable mig_life       "Interstate Migration"
label variable mig_life_noban "Interstate Migration (State without ban)"
label variable mig_life_ban   "Interstate Migration (State with ban)"

* ---- Marriage, family and household structure ----
label variable divorced    "Divorced"
label variable mult_genh   "Multigenerational HH"
label variable nchild      "No. of resident children"
label variable nchlt5      "No. of resident children (under 5)"
label variable agemarr     "Age of Marriage"
label variable durmarr     "Duration of Current Marriage"
label variable ncouples    "No. of couples in unit"
label variable ncouples_wo "No. of couples in unit (w/o own)"
label variable nfams       "No. of unrelated families in unit"
label variable famsize     "Family Size"
label variable nsibs       "No. of siblings in unit"
label variable nm_single   "Never Married/Single"

* ---- Dispersion measures and income rank ----
label variable occ_disp_hhi           "Occupational Dispersion HHI"
label variable occ_disp_hhi_10digit   "Occupational Dispersion HHI (10-digit codes)"
label variable occ_disp_hhi_occ_class "Occupational Dispersion HHI (Farmer, White Collar, Blue Collar, Self-Employed)"
label variable prank                  "Income percentile-rank"
label variable geog_disp_hhi_state    "Geographical Dispersion (State)"
label variable geog_disp_hhi_county   "Geographical Dispersion (County)"
label variable geog_disp_hhi_enumdist "Geographical Dispersion (Enumeration District)"

* ---- Institutionalization, disability and naming ----
label variable gq_inst      "Institutionalized"
label variable gq_inst_med  "Institutionalized (Medical)"
label variable gq_poorhouse "Institutionalized (Poorhouse)"
label variable hmem_disable "Household member (Disabled)"
label variable blind_deaf   "Blind/Deaf"
label variable genetic_d    "Institutionalized (Medical) + Disability"
label variable share_ch_cmn "Share childr. with common names"

* ---- Variables used to split the sample in heterogeneity analyses ----
label variable age_bin           "Age Bin"
label variable high1850_farm     "High vs Low Farm Status (1850)"
label variable high1850_realprop "High vs Low Real-Estate Wealth (1850)"


/*

For various robustness checks in the paper, the treatment assignment (year_of_ban) and the cousin marriage rates need to be merged appropriately with the census data. This code is
for the main results in the paper, where:

(1) Cousin marriage rates are at the surname-level
(2) year_of_ban is assigned on the basis of an individual's state of birth. 

Below is a set of instructions for the different robustness checks.

(1) Cousin marriage rates 
a. CM rates based on observed isonymy (cm_obs) -- For this, simply define the cmH dummy (the `gen cmH = ...` statement above) based on cm_obs and not cm. 

b. CM rates based on NYSIIS surnames -- For this, simply define the cmH dummy (the `gen cmH = ...` statement above) based on cm_nysiis and not cm. 

c. Cousin marriage rates (different thresholds) -- For this, simply define the cmH dummy (the `gen cmH = ...` statement above) using a different cut-off for cm (i.e. 0.08 or 0.12 instead of 0.1). 

d. CM rates at the surname-state level -- These data are directly available in Isonymy_fs_preperiod.dta. Merge with census data using surname *and* birth state and use the cmH dummy directly (do not collapse at surname-level). 


(2) Treatment Assignment Based on Father's State of Birth 
a. For this, also use year_of_ban.dta, but instead of merging on an individual's own state of birth, merge on their father's state of birth.

*/



*************************************************************************************************************************
** Generating variables to appropriately weight observations in collapsed data based on non-missing outcome occurrences **
*************************************************************************************************************************

* Outcomes that receive an <outcome>_nm indicator (1 = outcome observed for the
* individual, 0 = missing). Summed within a cell at the collapse stage, each
* indicator gives the number of individuals behind that cell's mean.
local outcomes inchat ///
	mig_life mig_life_noban mig_life_ban farm urban ///
	mult_genh divorced nchild nchlt5 nsibs agemarr married ///
	nfams ncouples ncouples_wo famsize ///
	gq_inst gq_inst_med gq_poorhouse hmem_disable blind_deaf genetic_d

foreach outcome of local outcomes {
	* missing() also catches extended missing values (.a-.z), which a plain
	* "!= ." comparison would wrongly count as observed.
	gen `outcome'_nm = !missing(`outcome')
	label var `outcome'_nm "Number of individuals with non-missing outcome in cell"
}

** Outcomes that will eventually be logged (after collapsing) **
gen ln_occscore_nm = !missing(occscore)
gen ln_citypop_nm  = !missing(citypop)

* Labelled by name rather than by a variable range, so the labels do not
* depend on the order in which variables sit in the dataset.
foreach v in ln_occscore_nm ln_citypop_nm {
	label var `v' "Number of individuals with non-missing outcome in cell"
}


****************************************************************************************	 
** Collapsing at the birthdecade-birthstatefip-year_of_ban-resstatefip-year-cmH level **
****************************************************************************************	 

* The individual-level data are preserved and restored afterwards, because the
* heterogeneity collapses further down start from the same uncollapsed file.
preserve

* (mean)   : cell averages of the outcomes. count_dor is a column of 1s in the
*            uncollapsed data, so the aweights do not change these averages.
* (rawsum) : unweighted totals, i.e. the cell size (count_dor) and the number of
*            individuals with a non-missing value for each outcome (*_nm).
*            gq_poorhouse_nm is included so that gq_poorhouse, like every other
*            averaged outcome with an indicator, carries its own count.
collapse ///
	(mean)  occscore prank inchat inchat_levels urban farm  ///
	mig_life mig_life_noban mig_life_ban citypop mult_genh ///
	nchild nchlt5 share_ch_cmn  nsibs divorced nm_single agemarr durmarr /// 
	nfams ncouples ncouples_wo famsize occ_disp_hhi occ_disp_hhi_10digit occ_disp_hhi_occ_class  ///
	geog_disp_hhi_county geog_disp_hhi_state geog_disp_hhi_enumdist ///
	gq_inst gq_inst_med gq_poorhouse hmem_disable blind_deaf genetic_d ///
	(rawsum) count_dor = count_dor ln_occscore_nm = ln_occscore_nm inchat_nm = inchat_nm ln_citypop_nm = ln_citypop_nm ///
	mig_life_nm = mig_life_nm mig_life_ban_nm = mig_life_ban_nm mig_life_noban_nm = mig_life_noban_nm farm_nm = farm_nm urban_nm = urban_nm ///
	mult_genh_nm = mult_genh_nm divorced_nm = divorced_nm nchild_nm = nchild_nm ///
	nchlt5_nm = nchlt5_nm nsibs_nm = nsibs_nm agemarr_nm = agemarr_nm nfams_nm = nfams_nm ///
	ncouples_nm = ncouples_nm ncouples_wo_nm = ncouples_wo_nm  famsize_nm = famsize_nm ///
	gq_inst_nm = gq_inst_nm gq_inst_med_nm = gq_inst_med_nm gq_poorhouse_nm = gq_poorhouse_nm ///
	blind_deaf_nm = blind_deaf_nm hmem_disable_nm = hmem_disable_nm genetic_d_nm = genetic_d_nm  ///
	[aweight = count_dor], ///
	by(birthdecade birthstatefip year_of_ban resstatefip year cmH) 

	
** Save cleaned, collapsed data for analysis **	
save "$dir/Data/Final/final_collapsed_clean.dta", replace

restore 

/*

1. To run the models with cousin marriage rates calculated differently (e.g. using observed isonymy, NYSIIS surnames, surname-state rates or different thresholds), the code above can be followed exactly. Of course, the high-cousin-marriage dummy cmH would then be based on the cm rate being used. It would be useful to save the final collapsed data under a different name (as we have in the current Data/Final folder:
a. final_collapsed_surnamestatecm.dta
b. final_collapsed_nysiis.dta
c. final_collapsed_obsiso.dta
d. final_collapsed_thresh8.dta
e. final_collapsed_thresh12.dta)

2. To run models where year_of_ban is assigned differently, such as using father's birthplace (rather than own birthplace), the code again would exactly be the same as above, but the year_of_ban variable would be merged using one's father's birthstate (FBPL).
The collapsed dataset in the Data/Final folder is:
a. final_collapsed_fbpl.dta 


3. Apart from the two points above, we collapse the data at different levels when analyzing heterogeneous effects, such as by age, marital status, baseline real-estate wealth or surname commonness. In these cases we additionally collapse at the level at which heterogeneity is being analyzed, so the code needs to be adjusted at the collapse stage. We have provided the code for each case below. The relevant datasets are saved in the Data/Final folder.

*/


*********************************************************
***** To Assess Heterogeneity by Real-Estate Wealth *****
*********************************************************

preserve

* Main cell definition plus the two 1850 baseline splits
local cell_flh birthdecade birthstatefip year_of_ban resstatefip year cmH high1850_farm high1850_realprop

* Cell means of the two outcomes, plus unweighted counts of individuals and of
* non-missing outcome values in each cell
collapse (mean)   occscore citypop                       ///
         (rawsum) count_dor ln_occscore_nm ln_citypop_nm ///
         [aweight = count_dor], by(`cell_flh')

save "$dir/Data/Final/final_collapsed_flhocc.dta", replace

restore

********************************************************
**** To Assess Heterogeneity by Surname Commonness *****
********************************************************

preserve 

* Number of records in memory that carry each surname
egen count_surname = count(surname), by(surname)
sort count_surname

* Quartile of surname frequency, computed across records
xtile scom_quartile = count_surname, nq(4)

* Main cell definition with the commonness quartile added
local cell_scom birthdecade birthstatefip year_of_ban resstatefip year scom_quartile cmH

collapse (mean)   occscore farm urban citypop            ///
         (rawsum) count_dor ln_occscore_nm ln_citypop_nm ///
         [aweight = count_dor], by(`cell_scom')

save "$dir/Data/Final/final_collapsed_scommonq.dta", replace

restore

***********************************************************
**** To Assess Heterogeneity by age and marital status ****
***********************************************************

preserve

* Main cell definition split further by age bin and never-married status
local cell_agems birthdecade birthstatefip year_of_ban resstatefip year age_bin nm_single cmH

collapse (mean)   occscore farm urban citypop            ///
         (rawsum) count_dor ln_occscore_nm ln_citypop_nm ///
         [aweight = count_dor], by(`cell_agems')

save "$dir/Data/Final/final_collapsed_ageb_ms.dta", replace

restore


***********************************************************
********* To Control for Surname Fixed Effects ************
***********************************************************

preserve

* Cells are additionally defined by surname (and by state), so that surname
* fixed effects can be included in the models
local cell_sfe surname birthdecade birthstatefip state year_of_ban resstatefip year cmH

collapse (mean)   occscore farm urban citypop            ///
         (rawsum) count_dor ln_occscore_nm ln_citypop_nm ///
         [aweight = count_dor], by(`cell_sfe')

save "$dir/Data/Final/final_collapsed_SFE.dta", replace

restore


********************************************************************
** Control for Surname Fixed Effects and Surname-state group size **
****** (Use this when cm rates are the surname-state-level) ********
********************************************************************

* This collapse averages N_fs, which exists only when the surname-state-level
* cm data were merged in. In the main specification the surname-level collapse
* at the top of this do-file renames N_fs to N_f, so N_fs is absent. The
* section is therefore skipped with a note instead of aborting the do-file.
* "exact" stops -confirm- from accepting an abbreviation match.
capture confirm variable N_fs, exact
if _rc == 0 {

	preserve

	collapse ///
		(mean) occscore urban farm  ///
		citypop  N_fs ///
		(rawsum) count_dor = count_dor ln_occscore_nm = ln_occscore_nm ln_citypop_nm = ln_citypop_nm ///
		[aweight = count_dor], ///
		by(surname birthdecade birthstatefip year_of_ban resstatefip year cmH) 

	save "$dir/Data/Final/final_collapsed_surnamestatecm_sz.dta", replace

	restore
}
else {
	display as text "Note: N_fs not found (surname-state cm data not merged); final_collapsed_surnamestatecm_sz.dta was not rebuilt."
}
